## Ryan Copus and Ryan Hübert
## Measuring How Much Judges Matter for Case Outcomes
## Journal of Law and Courts

# Note to reader: please review the README for important information about 
# software and other requirements

# Load configuration file
source(here::here("Code/_config.R"))

# Expand the factor columns of a data frame into indicator (dummy) columns.
#
# Numeric columns are kept as they are and placed first. The factor columns
# are replaced by the indicator table returned by ade4::acm.disjonctif().
# Columns that are neither numeric nor factor are not carried over.
#
# Args:
#   df: a data frame.
#
# Returns:
#   A data frame made of the numeric columns of `df` followed by the indicator
#   columns built from its factor columns. If `df` has no factor columns, only
#   the numeric columns are returned.
dummy <- function(df) {

  # vapply() always yields a logical vector, even for a zero-column frame
  is_num <- vapply(df, is.numeric, logical(1))
  is_fac <- vapply(df, is.factor, logical(1))

  # With exactly one numeric column this collapses to a bare vector (default
  # `drop`); the column name lost that way is restored further down.
  num_part <- df[, is_num]

  # drop = FALSE keeps a lone factor column as a data frame; a bare factor
  # cannot be processed by ade4::acm.disjonctif().
  fac_part <- df[, is_fac, drop = FALSE]

  if (ncol(fac_part) > 0) {
    DF <- data.frame(num_part, ade4::acm.disjonctif(fac_part))
  } else {
    # Nothing to dummy: return the numeric part on its own
    DF <- data.frame(num_part)
  }

  if (is.null(ncol(num_part))) {
    # Single numeric column: put its original name back on the first column
    names(DF)[1] <- colnames(df)[is_num]
  }

  DF
}

# Determine which folds still need to be estimated, i.e. which folds have
# missing values in `variable`.
#
# Args:
#   datafr:   data frame containing `variable`. The fold identifier is looked
#             up under the name `fold`, evaluated with `datafr` as the data
#             (normally a column of `datafr`).
#   variable: the column of `datafr` to inspect for NAs, as accepted by `[[`.
#   any.NA:   if TRUE, a fold is flagged as soon as any of its values is NA;
#             if FALSE (the default), only when all of its values are NA.
#
# Returns:
#   A named logical, all TRUE, with one element per flagged fold (names are
#   the fold identifiers). It is empty when no fold is flagged.
GetMissingFolds <- function(datafr, variable, any.NA = FALSE) {
  values <- datafr[[variable]]
  if (is.null(values)) {
    stop("Column '", variable, "' not found in `datafr`.", call. = FALSE)
  }

  # Evaluate the arguments outside with() so that columns that happen to be
  # called `datafr` or `variable` cannot mask them.
  is_missing <- is.na(values)
  fold_id <- with(datafr, fold)

  reducer <- if (any.NA) any else all
  tr <- tapply(is_missing, fold_id, reducer)

  # tapply() returns NA for fold levels without any rows; which() leaves those
  # out instead of letting them through as NA entries.
  tr[which(tr)]
}

# Fit an h2o GLM with lambda search and, if needed, refit at a fixed lambda.
#
# h2o's lambda search sometimes appears not to converge. As a workaround
# (not strictly necessary, and not always effective), the warnings of the
# first fit are inspected: when one of them reports that the maximum number
# of iterations was reached, the model is fitted again without lambda search,
# using the `lambda_best` value stored in the first fit.
#
# Args:
#   mod.name:       model id for the fit; the refit uses this id plus "_2".
#   x.var, y.var:   passed to h2o.glm() as `x` and `y`.
#   trainfr:        passed to h2o.glm() as `training_frame`.
#   internal.folds: passed to h2o.glm() as `nfolds`.
#   alpha.val:      passed to h2o.glm() as `alpha`.
#   family:         passed to h2o.glm() as `family` (default "AUTO").
#
# Returns:
#   The model returned by h2o.glm(): the first fit, or the refit when the
#   iteration warning was seen.
glmWrapper <- function(mod.name, x.var, y.var, trainfr, internal.folds,
                       alpha.val, family = "AUTO") {
  captured <- character(0)

  # Store the message of every warning raised during the first fit and keep
  # the warning itself from being displayed.
  remember_warning <- function(w) {
    captured <<- c(captured, conditionMessage(w))
    invokeRestart("muffleWarning")
  }

  fm <- withCallingHandlers(
    h2o.glm(
      x = x.var,
      y = y.var,
      training_frame = trainfr,
      model_id = mod.name,
      nfolds = internal.folds,
      fold_assignment = "Modulo",
      keep_cross_validation_predictions = TRUE,
      family = family,
      alpha = alpha.val,
      lambda_search = TRUE, # let h2o search for the regularization strength
      nlambdas = 100,       # number of lambda values h2o searches over
      seed = 1
    ),
    warning = remember_warning
  )

  hit_iteration_cap <- any(
    grepl("Reached maximum number of iterations", captured)
  )
  if (!hit_iteration_cap) {
    return(fm)
  }

  # Refit at the lambda selected by the first fit, with lambda search off
  h2o.glm(
    x = x.var,
    y = y.var,
    training_frame = trainfr,
    model_id = paste0(mod.name, "_2"),
    nfolds = internal.folds,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = TRUE,
    family = family,
    alpha = alpha.val,
    lambda_search = FALSE,
    lambda = fm@model$lambda_best,
    max_iterations = 100,
    seed = 1
  )
}

# Extract cross-validation and validation predictions from a fitted model.
#
# Some machines cannot run XGBoost in h2o, so two kinds of model are handled.
# Which one applies is decided from the text of the expression passed as
# `model`: if it contains "xgb", the non-h2o path is used.
#
#   * "xgb" models: cross-validation predictions are read from
#     `model_cv_only$pred`, validation predictions come from predict().
#   * all other models: cross-validation predictions are the "p1" column of
#     the model's cross-validation holdout predictions frame, validation
#     predictions are the third column of the h2o.predict() output
#     (presumably also "p1" -- confirm against the h2o output layout).
#
# Args:
#   model:         fitted model. The expression used for this argument also
#                  becomes the name of the prediction column.
#   newdataframe:  data to predict on (the `newdata` of the predict call).
#   results:       NULL, or the list returned by an earlier call; new
#                  predictions are then appended as extra columns.
#   model_cv_only: object with a `pred` element holding the cross-validation
#                  predictions; required for "xgb" models, unused otherwise.
#
# Returns:
#   A list with data frames `cv.preds` and `val.preds`, one column per model
#   collected so far.
getPreds <- function(model, newdataframe, results = NULL, model_cv_only = NULL) {
  # deparse() may split a long expression over several strings; collapse it
  # so the name is always a single string.
  model_name <- paste(deparse(substitute(model), width.cutoff = 500L),
                      collapse = " ")

  if (grepl("xgb", model_name)) {
    if (is.null(model_cv_only)) {
      stop("`model_cv_only` must be supplied for xgb models (model: ",
           model_name, ").", call. = FALSE)
    }
    cv.preds <- as.data.frame(model_cv_only$pred)
    val.preds <- as.data.frame(predict(model, newdata = newdataframe))
  } else {
    cv_frame_id <- model@model$cross_validation_holdout_predictions_frame_id$name
    cv.preds <- as.data.frame(h2o.getFrame(cv_frame_id)[["p1"]])
    val.preds <- as.data.frame(h2o.predict(model, newdata = newdataframe)[3])
  }

  # Label the (last) prediction column with the model's name
  colnames(cv.preds)[ncol(cv.preds)] <- model_name
  colnames(val.preds)[ncol(val.preds)] <- model_name

  if (is.null(results)) {
    results <- list(cv.preds = cv.preds, val.preds = val.preds)
  } else {
    results[["cv.preds"]] <- cbind(results[["cv.preds"]], cv.preds)
    results[["val.preds"]] <- cbind(results[["val.preds"]], val.preds)
  }

  results
}